#!/usr/bin/python3

# Copyright (c) 2023, University of Michigan.  All rights reserved.
#
# Developed by: Freddolino Lab
#              University of Michigan Department of Biological Chemistry
#              petefred@umich.edu
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal with
# the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to
# do so, subject to the following conditions:
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimers.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimers in the documentation
#   and/or other materials provided with the distribution.
# * Neither the names of the Freddolino lab, the University of Michigan,
#   nor the names of its contributors may be used to endorse or promote products
#   derived from this Software without specific prior written permission.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
# CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
# SOFTWARE.

# This script will execute a simple practical equivalence test on expression data (typically from DESeq2 or similar pipelines)
# The challenge that it is designed to address is that when analyzing RNAseq data, for genes with high q values,
#   it is often not clear whether the gene is really not changing in expression, or whether there is simply too much noise
#   to detect the change (indeed, this is an under-appreciated interpretive problem with all frequentist statistical tests)
# To address that problem, we perform a separate statistical test of the null hypothesis that the log2 fold change (log2FC)
#   for each transcript of interest is GREATER in magnitude than a specified region of practical equivalence (ROPE)
# If the null hypothesis is rejected, then we have some evidence that the expression of the gene is practically
#   equivalent in the two conditions being compared
# The test makes the approximation that under the null distribution, the log2fc of each transcript is normally distributed
#   with mean equal to the ROPE boundary and standard deviation equal to the standard error of the log2fc from DESeq2

# Usage: calc_sig_unchanged.py <infile> <outfile> <ROPE>
# Command line arguments:
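#   infile -- Name of the input file. Should be a tab separated file with transcript names in the first column, one column named
#                 log2FoldChange, and one column named SE (the standard error of the log2 fold change; DESeq2 reports this
#                 column as lfcSE, so rename it to SE first). Columns that are entirely empty are discarded, and rows with a
#                 missing value in any remaining column are dropped before testing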
#   outfile -- Name for the output file, which will have columns added for the p values and q values of the practical equivalence test
#   ROPE -- size of the region of practical equivalence (around 0). Larger values make it easier to call expression as practically equivalent across
#           conditions. Should be chosen such that you're confident in saying that a log2fc is biologically meaningless if it falls within the ROPE

# Requirements: python3 installation with the pandas, scipy, and statsmodels libraries

import pandas
import scipy.stats
import sys
import statsmodels.stats.multitest

# ---- Command line arguments ----
# Print the usage string instead of dying with a bare IndexError when arguments are missing.
# Extra trailing arguments are ignored.
if len(sys.argv) < 4:
    sys.exit("Usage: calc_sig_unchanged.py <infile> <outfile> <ROPE>")

infile = sys.argv[1]
outfile = sys.argv[2]
ROPE = float(sys.argv[3])

# The ROPE is the half-width of an interval around 0 and is used below as the mean of the null
# distribution for |log2FC|; a negative, infinite or NaN value cannot give meaningful p values
if not 0 <= ROPE < float("inf"):
    sys.exit("ROPE must be a finite, non-negative number (got {})".format(sys.argv[3]))

# ---- Load and clean the input table ----
dattab = pandas.read_table(infile, sep="\t")
print(dattab.head())

# Discard columns that contain no data at all...
dattab.dropna(how='all', inplace=True, axis=1)

# ...and make sure the two columns the test depends on survived that step, so that a wrongly
# named or empty column is reported clearly rather than as an AttributeError further down
missing_cols = [col for col in ("log2FoldChange", "SE") if col not in dattab.columns]
if missing_cols:
    sys.exit("Input file {} has no usable column(s) named: {}".format(infile, ", ".join(missing_cols)))

# Drop every row that has a missing value in any of the remaining columns
dattab.dropna(inplace=True)

# ---- Practical equivalence test ----
# Null hypothesis for each row: |log2FC| lies at (or beyond) the ROPE boundary. Under the normal
#   approximation |log2FC| ~ N(ROPE, SE), so the one-sided p value is the lower tail probability of
#   the observed |log2FC|. The whole column is evaluated in a single vectorised call.
abs_log2fc = abs(dattab["log2FoldChange"].values.astype(float))
se_log2fc = dattab["SE"].values.astype(float)
pvals_unch = scipy.stats.norm.cdf(abs_log2fc, loc=ROPE, scale=se_log2fc)

# scipy returns NaN where the test is undefined (e.g. SE <= 0). Those rows must be kept out of the
#   multiple testing correction, because a single NaN p value would turn every q value into NaN.
testable = pandas.notnull(pvals_unch)
n_untestable = len(pvals_unch) - int(testable.sum())
if n_untestable:
    sys.stderr.write(
        "Warning: no p value could be computed for {} row(s) (e.g. SE <= 0); "
        "their pvals_unch and qvals_unch are reported as NaN\n".format(n_untestable)
    )

# Benjamini-Hochberg FDR correction over the testable rows only; untestable rows keep NaN.
# The correction is skipped entirely when there is nothing to correct (e.g. an empty table).
qvals_unch = pvals_unch.copy()
if testable.any():
    qvals_unch[testable] = statsmodels.stats.multitest.multipletests(
        pvals_unch[testable], method='fdr_bh'
    )[1]
print(qvals_unch)

# ---- Write the results ----
dattab['pvals_unch'] = pvals_unch
dattab['qvals_unch'] = qvals_unch

dattab.to_csv(outfile, sep="\t", index=False)
